"""
In this file we preprocess the raw data and split it into training, validation
and test data. Each sample is assigned at random: roughly 80% go to training,
10% to validation and 10% to test. Each dataset is stored as a numpy array of
shape (num, 2): the first item is the lyric string, and the second is the
corresponding label. The arrays are saved under data/.
"""

import random
import csv
import numpy as np
from utils import *

# Category names; a name's position in this list is its integer label.
types = ['gufeng', 'liuxing', 'minyao', 'shuochang']
labels = {category: index for index, category in enumerate(types)}

# Start every split as its own empty list of [lyric, label] samples.
train, val, test = [], [], []

# Load the raw data, clean each lyric and assign it to one of the splits.
# NOTE: the assignment uses the unseeded global `random` generator, so the
# split differs between runs; call random.seed(...) beforehand if it has to
# be reproducible.
for name in types:
    # newline='' lets the csv module handle line endings itself, so quoted
    # fields that contain line breaks are parsed correctly.
    with open('data/' + name + '.csv', encoding='utf-8', newline='') as f:
        for row in csv.reader(f):
            if not row:
                # csv.reader yields [] for blank lines; there is no first
                # column to read, so skip them instead of raising IndexError.
                continue
            lyric = clear_str(row[0])  # only the first column is used
            sample = [lyric, labels[name]]
            random_score = random.random()
            # Roughly 80% train, 10% validation, 10% test.
            if random_score < 0.8:
                train.append(sample)
            elif random_score < 0.9:
                val.append(sample)
            else:
                test.append(sample)
                
# Turn each split into a NumPy array, then write the three arrays to data/.
train, val, test = np.array(train), np.array(val), np.array(test)
for out_path, split in (
    ('data/train.npy', train),
    ('data/val.npy', val),
    ('data/test.npy', test),
):
    np.save(out_path, split)
